Source code for nlp_architect.models.cross_doc_coref.system.cdc_utils

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************

import logging
import os
from typing import List

from nlp_architect.common.cdc.cluster import Clusters
from nlp_architect.common.cdc.mention_data import MentionData
from nlp_architect.common.cdc.topics import Topic
from nlp_architect.utils.string_utils import StringUtils

logger = logging.getLogger(__name__)


def write_clusters_to_file(clusters: Clusters, topic_id: str, file_obj) -> None:
    """
    Write the clusters to a text file (for experiments or evaluation using
    coreference scorer (v8.01))

    The output starts with a ``Topic - <topic_id>`` header line. Each cluster
    then contributes two lines: ``cluster #<n>`` (numbered from 1) and the
    ``str()`` of a list of ``(tokens_str, predicted_coref_chain)`` tuples, one
    tuple per mention in the cluster.

    Args:
        clusters: the clusters to write; its ``clusters_list`` is iterated
        topic_id: topic identifier written in the header line
        file_obj: file object (anything exposing ``write``)
    """
    file_obj.write('Topic - ' + topic_id + '\n')
    # enumerate(start=1) replaces the hand-maintained counter; numbering is 1-based.
    for cluster_number, cluster in enumerate(clusters.clusters_list, start=1):
        # The cluster header is written before the mentions are read, as before.
        file_obj.write('cluster #' + str(cluster_number) + '\n')
        mentions_list = [
            (mention.tokens_str, mention.predicted_coref_chain)
            for mention in cluster.mentions
        ]
        file_obj.write(str(mentions_list) + '\n')
def extract_vocab(mentions: List[MentionData], filter_stop_words: bool) -> List[str]:
    """
    Build a vocabulary of unique strings out of the head, head lemma and
    mention string of every mention

    Args:
        mentions: mentions to read ``mention_head``, ``mention_head_lemma``
            and ``tokens_str`` from
        filter_stop_words: when True, any string for which
            ``StringUtils.is_stop`` returns a truthy value is left out

    Returns:
        list of the distinct collected strings (order is that of the
        underlying set, i.e. not sorted)
    """
    unique_terms = set()
    for mention in mentions:
        candidates = (
            mention.mention_head,
            mention.mention_head_lemma,
            mention.tokens_str,
        )
        if filter_stop_words:
            candidates = [term for term in candidates if not StringUtils.is_stop(term)]
        unique_terms.update(candidates)

    return list(unique_terms)
def load_mentions_vocab_from_files(mentions_files, filter_stop_words=False):
    """
    Read mentions from every given JSON file and return their vocabulary

    Args:
        mentions_files: iterable of mention files, each passed to
            ``MentionData.read_mentions_json_to_mentions_data_list``
        filter_stop_words: forwarded to ``load_mentions_vocab``

    Returns:
        the value returned by ``load_mentions_vocab`` for the combined mentions
    """
    logger.info('Loading mentions files...')
    all_mentions = [
        mention
        for mentions_file in mentions_files
        for mention in MentionData.read_mentions_json_to_mentions_data_list(mentions_file)
    ]
    return load_mentions_vocab(all_mentions, filter_stop_words)
def load_mentions_vocab(mentions, filter_stop_words=False):
    """
    Extract the vocabulary of already-loaded mentions

    Args:
        mentions: mentions forwarded to ``extract_vocab``
        filter_stop_words: forwarded to ``extract_vocab``

    Returns:
        the vocabulary produced by ``extract_vocab``
    """
    vocabulary = extract_vocab(mentions, filter_stop_words)
    logger.info('Done loading mentions files...')
    return vocabulary
def write_event_coref_scorer_results(topics_list: List[Topic], output_file: str) -> None:
    """
    Write the predicted chains of all topics to
    ``cd_event_pred_clusters_spans.txt``

    Args:
        topics_list: topics handed to ``write_topics``
        output_file: used as the leading path component (the file name above
            is joined onto it), so despite the name it acts as a directory
    """
    target_path = os.path.join(output_file, 'cd_event_pred_clusters_spans.txt')
    with open(target_path, 'w') as handle:
        write_topics(topics_list, handle)
def write_entity_coref_scorer_results(topics_list: List[Topic], output_file: str) -> None:
    """
    Write the predicted chains of all topics to
    ``cd_entity_pred_clusters_spans.txt``

    Args:
        topics_list: topics handed to ``write_topics``
        output_file: used as the leading path component (the file name above
            is joined onto it), so despite the name it acts as a directory
    """
    target_path = os.path.join(output_file, 'cd_entity_pred_clusters_spans.txt')
    with open(target_path, 'w') as handle:
        write_topics(topics_list, handle)
def write_topics(topics_list: List[Topic], output) -> None:
    """
    Write one line per mention, wrapped in begin/end document markers

    Each mention line is the ``ECB+/ecbplus_all`` label, a tab, and the
    mention's ``predicted_coref_chain`` in parentheses. No newline is written
    after the closing ``#end document`` marker.

    Args:
        topics_list: topics whose ``mentions`` are written in order
        output: writable object (anything exposing ``write``)
    """
    output.write('#begin document (ECB+/ecbplus_all); part 000\n')

    # Lazily flatten topics -> mentions -> predicted chain, preserving order.
    predicted_chains = (
        mention.predicted_coref_chain
        for topic in topics_list
        for mention in topic.mentions
    )
    for chain in predicted_chains:
        output.write('ECB+/ecbplus_all\t' + '(' + str(chain) + ')\n')

    output.write('#end document')